# Packages used in the ingestion of data
library(httr) # Wrapper for the curl package
library(XML) # Dealing with text in xml
library(jsonlite) # Getting and formatting JSON
library(rplos) # API to PLOS for journals
library(readr) # Reading rectangular data
library(plyr) # Wrangling data
library(dplyr) # Wrangling data
library(tibble) # Tidier table object
library(tidyr) # Tidying data
library(stringr) # Manipulating strings using RegEx
library(purrr)
library(tm) # Text mining
library(rjson) # Work with JSON format
library(fulltext) # Text mining of open access journals
# Packages used for special use cases
library(stats)
library(topicmodels) # for LDA topic modelling
library(SnowballC) # for stemming
library(tidyverse) # general utility & workflow functions
library(tidytext) # text analysis in R
library(lubridate) # working with times and dates
library(rvest) # Download and manipulate HTML
# Packages used to prettify and produce visualizations
library(kableExtra) #
library(rjson) # Work with JSON format
library(ggplot2) # graphs library
library(purrr) #
library(cowplot) # ggplot2 add-on for pretty graphsAssessment of the state of science is a requirement for research. Assessment of intellectual property is a requirement for capitalization of knowledge for profit. This project explores tools for surveying patent data and research data in order to develop methods for making these assessments.
The project begins with an examination of data from NASA, whose open data portal includes a dataset of NASA patents. The format is a delimited text file. The data include a patent case number and title. Exploration of this dataset may include unsupervised learning to yield topics of patents.
The World Intellectual Property Organization (WIPO) publishes The WIPO Manual on Open Source Patent Analytics with instruction for data interrogation useful for subsequent parts of this project. The project will continue with a more detailed exploration of patent cases opened by NASA or another science-focused organization with a view to revealing more topics from within patent case data other than the title of the case, perhaps from the patent application, for example.
There are at least two data sources for patent data. The primary data source is the U.S. Patent and Trademark Office (USPTO), whose Open Data Portal publishes many datasets in various formats. USPTO also supplies APIs for patent data retrieval. Besides these APIs, there is at least one potentially useful third party API called PatentsView.
One resource for research publications is the Public Library of Science (PLOS), who publish open access peer reviewed articles. The inventory of articles seems limited, but appears to enable an introduction to searching for research papers for which we will utilize the R package, rplos, a programmatic interface to the Solr based search API provided by PLOS. Solr is an open source search platform built on Apache Lucerne.
Our research topic is quantification and characterization of NASA innovation through inspection of NASA’s patents. For example, we expect to count patents which cite NASA patents (quantification) and to mine topics from NASA patents (characterization). We will report and visualize these findings.
The motivation for this project relates to methods for Information Retrieval and the team’s collective interest and experience in startup and scientific industries. Efficient topic modelling can form the foundation of a number of advanced NLP techniques such as Word sense disambiguation, Natural Text Generation, and automatic Summarization on its own. A number of derivative analysis are possible based on NLP, including studies combining our results with industry or innovation metrics.
readr blog post. Illustrates the approach, although this syntax doesn’t work anymore. The approach writes the specification to a file.nasa_patents_fn <- "NASA_Patents.csv"
# Specify datatypes.
nasa_spec <- spec_csv(file = file.path("data", nasa_patents_fn))## Parsed with column specification:
## cols(
## Center = col_character(),
## Status = col_character(),
## `Case Number` = col_character(),
## `Patent Number` = col_character(),
## `Application SN` = col_character(),
## Title = col_character(),
## `Patent Expiration Date` = col_character()
## )
nasa_spec$cols[["Status"]] <- col_factor()
nasa_spec$cols[["Patent Expiration Date"]] <- col_date("%m/%d/%Y")
# Read patent file.
nasa_patents <- readr::read_csv(file = file.path("data", nasa_patents_fn),
col_types = nasa_spec)
# In variable names, replace spaces.
names(nasa_patents) <- str_replace_all(string = names(nasa_patents),
pattern = " ",
replacement = "_")
# Transform the application number to USPTO format
nasa_patents <- nasa_patents %>%
mutate(Application_SN_uspto = str_c("US",
str_replace_all(Application_SN, "[[:punct:]]", "")))# Example https://developer.uspto.gov/ibd-api/v1/patent/application?applicationNumber=US12795356'
is_na_patnum <- is.na(nasa_patents$Patent_Number)
is_patnum <- nasa_patents$Patent_Number > 0
nasa_patent_numbers <- nasa_patents$Patent_Number[!is_na_patnum & is_patnum]
# Need to review contents of JSON files.
for (app in nasa_patents$Application_SN_uspto) {
#get_patent_application_uspto(app)
url <- paste0('https://developer.uspto.gov/ibd-api/v1/patent/application?applicationNumber=',app)
path <- "data/uspto"
file_path <- file.path(path, str_c(app, ".html"))
if (!file.exists(file_path)) {
patent_application <- jsonlite::fromJSON(url)
#test
print(patent_application)
# Bulk download not ideal
##download.file(patent_application$response$docs$archiveUrl, file.path(path))
# Instead use the following form to query for new HTML files
# http://patft.uspto.gov/netahtml/PTO/index.html
break
}
}## $response
## $response$numFound
## [1] 2
##
## $response$start
## [1] 0
##
## $response$docs
## applicationType documentId applicationNumber documentType
## 1 UTILITY US20110212334A1 US13033085 application
## 2 UTILITY US8623253B2 US13033085 grant
## publicationDate documentDate productionDate
## 1 2011-09-01T00:00:00Z 2011-09-01T00:00:00Z 2011-08-17T00:00:00Z
## 2 2014-01-07T00:00:00Z 2014-01-07T00:00:00Z 2013-12-24T00:00:00Z
## applicationDate
## 1 2011-02-23T00:00:00Z
## 2 2011-02-23T00:00:00Z
## applicant
## 1 Jolley, Scott T., Gibson, Tracy L., Williams, Martha K., Parrish, Clyde F., Parks, Steven L.
## 2 Jolley, Scott T., Gibson, Tracy L., Williams, Martha K., Parrish, Clyde F., Parks, Steven L.
## inventor
## 1 Jolley, Scott T., Gibson, Tracy L., Williams, Martha K., Parrish, Clyde F., Parks, Steven L.
## 2 Jolley, Scott T., Gibson, Tracy L., Williams, Martha K., Parrish, Clyde F., Parks, Steven L.
## assignee
## 1 United States of America as Represented by the Administrator of the National aeronautics and, Space Administration
## 2 The United States of America as Represented by the Administrator of the National Aeronautics and Space Administration
## title
## 1 Low-Melt Poly(amic Acids) and Polyimides and Their Uses
## 2 Low-Melt Poly(amic Acids) and Polyimides and Their Uses
## archiveUrl
## 1 https://bulkdata.uspto.gov/data/patent/application/redbook/fulltext/2011/ipa110901.zip
## 2 https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/2014/ipg140107.zip
## pdfPath year _version_ patentNumber
## 1 NOTUPDATED 2011 1.622101e+18 <NA>
## 2 NOTUPDATED 2014 1.626650e+18 8623253
## [1] "Downloads complete."
Exploratory chunk disabled.
# explore downloaded data
raw.html <- read_html("data/6020587.html")
# title
raw.html %>% html_nodes("body search-app article h1") %>% html_text() %>% .[[1]]
# filing date
raw.html %>% html_nodes("[itemprop='filingDate']") %>% html_text() %>% .[[2]]
# abstract
raw.html %>% html_nodes("abstract") %>% html_text()
# description > origin of the invention
raw.html %>% html_nodes("body search-app article section div div p") %>% html_text() %>% .[[1]]
# description > background of the invention > field of the invention
raw.html %>% html_nodes("body search-app article section div div p") %>% html_text() %>% .[[3]]
# cited by
raw.html %>% html_nodes(xpath="/html/body/search-app/article/table") %>% html_table()
# cited by (count) nrow()
cited_by <- raw.html %>% html_nodes(xpath="/html/body/search-app/article/table") %>% html_table()
# patent citations
raw.html %>% html_nodes(xpath="/html/body/search-app/article/section/table") %>% html_table()
raw.html %>% html_nodes(xpath="/html/body/search-app/article/section/h2") %>% html_text()# create object with all filenames
files <- list.files(path="data", pattern="[0-9]{5,}.html", full.names=TRUE, recursive=FALSE)
# create lists of extracted data
# title
raw.title <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes("body search-app article h1") %>% html_text() %>% .[[1]] %>% str_remove(" \\n +\\- [[:alpha:]]{6,} [[:alpha:]]{6,}")
})
# abstract
raw.abstract <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes("abstract") %>% html_text()
})
# prior art date as proxy for filing date that includes support 1899 example
filing.date <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes("[itemprop='priorArtDate']") %>% html_text() %>% .[[1]]
})
# description > origin of the invention
raw.origin <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes("body search-app article section div div p") %>% html_text() %>% .[[1]]
})
# description > background of the invention > field of the invention
raw.background <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes("body search-app article section div div p") %>% html_text() %>% .[[3]]
})We observe that each patent includes citations to reference other patents, as well as those other patents that cite it. We then ask if it might be possible to identify counts of each to demonstrate, for instance, the count of citations over time
This analysis will help us begin to answer the question, “how, and to what extent, do patents build on other patents?” Subsequent analysis should account for patent density across time, as well as the calculation of a rough proxy metric of the referenciblity of each patent (how much more or less is a given patent cited?).
# cited by (count)
cited_by <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes(xpath="/html/body/search-app/article/section/h2") %>% html_text() %>% str_subset(pattern = "Cited")
})
# clean output
cited_by <- as.character(cited_by)
# cited by (count)
count.cited_by <- lapply(cited_by, function(x)
# remove up to digit
x %>% str_remove("\\D+")
)
count.cited_by <- lapply(count.cited_by, function(x)
# remove after digit
x %>% str_remove("\\)$")
)
count.cited_by <- lapply(count.cited_by, function(x)
# remove after digit
x %>% as.integer()
)
# citations (count)
citations <- lapply(files, function(x) {
raw.html <- read_html(x, header=TRUE) # load file
# apply function
raw.html %>% html_nodes(xpath="/html/body/search-app/article/section/h2") %>% html_text() %>% str_subset(pattern = "Patent Citations")
})
# drop Non-Patent Citations
citations <- lapply(citations, function(x) {
# apply function
str_detect(x, "Non-") %>%
discard(x, .)
})
# clean output
citations <- as.character(citations)
# cited by (count)
count.citations <- lapply(citations, function(x)
# remove up to digit
x %>% str_remove("\\D+")
)
count.citations <- lapply(count.citations, function(x)
# remove after digit
x %>% str_remove("\\)$")
)
count.citations <- lapply(count.citations, function(x)
# remove after digit
x %>% as.integer()
)# not necessary, but makes lists nicer to read
raw.title <- unlist(raw.title, recursive = FALSE)
raw.abstract <- unlist(raw.abstract, recursive = FALSE)
raw.origin <- unlist(raw.origin, recursive = FALSE)
filing.date <- filing.date %>% unlist(recursive = FALSE)
raw.background <- unlist(raw.background, recursive = FALSE)
count.citations <- count.citations %>% unlist(recursive = FALSE)
count.cited_by <- count.cited_by %>% unlist(recursive = FALSE)
# get "appId" with regex
app_num <- unlist(str_extract_all(raw.title, "[A-Z]{2,}[0-9]{6,}[A-Z]+[0-9]?"))
# remove "appId" from "title"
raw.title <- str_remove(raw.title, app_num)
raw.title <- str_remove(raw.title, " - ")
# bind lists into dataframe and name columns
raw.df <- setNames(do.call(rbind.data.frame, Map('c', app_num, raw.title, raw.abstract, raw.origin, raw.background, count.cited_by, count.citations, filing.date)), c("appId", "title", "abstract", "origin", "background", "citedBy", "citations", "filingDate"))## Warning in mapply(FUN = f, ..., SIMPLIFY = FALSE): longer argument not a
## multiple of length of shorter
## appId
## 1 US464750A
## 2 US5585083A
## 3 US5606014A
## 4 US5617873A
## 5 US5632841A
## title
## 1 Button-hole-scissors gage - Google Patents
## 2 Catalytic process for formaldehyde oxidation
## 3 Imide oligomers and co-oligomers containing pendent phenylethynyl groups and polymers therefrom
## 4 Non-invasive method and apparatus for monitoring intracranial pressure and pressure volume index in humans
## 5 Thin layer composite unimorph ferroelectric driver and sensor
## abstract
## 1 Disclosed is a process for oxidizing formaldehyde to carbon dioxide and water without the addition of energy. A mixture of formaldehyde and an oxidizing agent (e.g., ambient air containing formaldehyde) is exposed to a catalyst which includes a noble metal dispersed on a metal oxide which possesses more than one oxidation state. Especially good results are obtained when the noble metal is platinum, and the metal oxide which possesses more than one oxidation state is tin oxide. A promoter (i.e., a small amount of an oxide of a transition series metal) may be used in association with the tin oxide to provide very beneficial results.\n
## 2 Controlled molecular weight imide oligomers and co-oligomers containing pendent phenylethynyl groups (PEPIs) and endcapped with nonreactive or phenylethynyl groups have been prepared by the cyclodehydration of the precursor amide acid oligomers or co-oligomers containing pendent phenylethynyl groups and endcapped with nonreactive or phenylethynyl groups. The amine terminated amide acid oligomers or co-oligomers are prepared from the reaction of dianhydride(s) with an excess of diamine(s) and diamine containing pendent phenylethynyl groups and subsequently endcapped with a phenylethynyl phthalic anhydride or monofunctional anhydride. The anhydride terminated amide acid oligomers and co-oligomers are prepared from the reaction of diamine(s) and diamine containing pendent phenylethynyl group(s) with an excess of dianhydride(s) and subsequently endcapped with a phenylethynyl amine or monofunctional amine. The polymerizations are carried out in polar aprotic solvents such as under nitrogen at room temperature. The amide acid oligomers or co-oligomers are subsequently cyclodehydrated to the corresponding imide oligomers. The polymers and copolymers prepared from these materials exhibit a unique and unexpected combination of properties.\n
## 3 Non-invasive measuring devices responsive to changes in a patient's intracranial pressure (ICP) can be accurately calibrated for monitoring purposes by providing known changes in ICP by non-invasive methods, such as placing the patient on a tilting bed and calculating a change in ICP from the tilt angle and the length of the patient's cerebrospinal column, or by placing a pressurized skull cap on the patient and measuring the inflation pressure. Absolute values for the patient's pressure-volume index (PVI) and the steady state ICP can then be determined by inducing two known changes in the volume of cerebrospinal fluid while recording the corresponding changes in ICP by means of the calibrated measuring device. The two pairs of data for pressure change and volume change are entered into an equation developed from an equation describing the relationship between ICP and cerebrospinal fluid volume. PVI and steady state ICP are then determined by solving the equation. Methods for inducing known changes in cerebrospinal fluid volume are described.\n
## 4 A method for forming ferroelectric wafers is provided. A prestress layer is placed on the desired mold. A ferroelectric wafer is placed on top of the prestress layer. The layers are heated and then cooled, causing the ferroelectric wafer to become prestressed. The prestress layer may include reinforcing material and the ferroelectric wafer may include electrodes or electrode layers may be placed on either side of the ferroelectric layer. Wafers produced using this method have greatly improved output motion.\n
## 5 A quasi four-level solid-state laser is provided. A laser crystal is disposed in a laser cavity. The laser crystal has a LuAG-based host material doped to a final concentration between about 2% and about 7% thulium (Tm) ions. For the more heavily doped final concentrations, the LuAG-based host material is a LuAG seed crystal doped with a small concentration of Tm ions. Laser diode arrays are disposed transversely to the laser crystal for energizing the Tm ions.\n
## origin
## 1 (No Model.) A. I. CAMPBELL.
## 2 The invention described herein was jointly made by employees of the United States Government, contract employees during the performance of work under a NASA contract which is subject to the provisions of Public Law 95-517 (35 USC 202) in which the contractor has elected not to retain title, and an employee of Rochester Gas and Electric Corporation during the performance of work under a Memorandum of Agreement.
## 3 This invention described herein was made by employees of the United States Government and may be manufactured and used by or for the Government or government purposes without payment of any royalties therein or thereof.
## 4 The invention described herein was made in the performance of work done by employees of the U.S. Government and may be manufactured and used by or for the government for governmental purposes without the payment of any royalties thereon or therefor.
## 5 The invention described herein was made by employees of the United States Government and may be used by and for the Government for governmental purposes without the payment of any royalties thereon or therefor.
## background
## 1 No. 464,750. Patented Dec.8,1891.
## 2 This invention relates generally to oxidizing formaldehyde. It relates particularly to a process for oxidizing formaldehyde to carbon dioxide and water, which process includes exposing a gaseous mixture containing formaldehyde and an oxidizing agent to a catalyst of a noble metal dispersed on a metal oxide possessing more than one stable oxidation state.
## 3 The synthesis and characterization of PI has been extensively studied and documented. Reviews on PI are available. [J. W. Verbicky, Jr., "Polyimides" in Encyclopedia of Polymer Science and Engineering, 2nd Ed., John Wiley and Sons, New York, Vol. 12, 364 (1988); C. E. Sroog, Prog. Polym. Sci., 16, 591 (1991)].
## 4 1. Field of The invention
## 5 The present invention relates generally to ferroelectric devices, and more particularly to ferroelectric devices providing large mechanical output displacements.
## citedBy citations filingDate
## 1 4 0 1891-12-08
## 2 30 7 1995-03-30
## 3 20 3 1995-08-04
## 4 59 17 1994-08-25
## 5 140 2 1995-04-04
raw.df$filingDate <- as.Date(raw.df$filingDate, format = "%Y-%m-%d")
# drop outlier date
raw.df <- raw.df %>%
subset(filingDate != as.Date("1891-12-08"))
# date quartiles
quartile <- (max(raw.df$filingDate) - min(raw.df$filingDate)) / 4
q1 <- min(raw.df$filingDate) + quartile
q2 <- min(raw.df$filingDate) + (quartile*2)
q3 <- min(raw.df$filingDate) + (quartile*3)
q4 <- max(raw.df$filingDate)
# citation count per patent filed between 2006 and 2012
citations4 <- raw.df %>%
select(citations, citedBy, title, filingDate) %>%
filter(filingDate >= as.Date(q3) & filingDate <= as.Date(q4))
# citation count per patent filed between 2000 and 2006
citations3 <- raw.df %>%
select(citations, citedBy, title, filingDate) %>%
filter(filingDate >= as.Date(q2) & filingDate <= as.Date(q3))
# citation count per patent filed between 1994 and 2000
citations2 <- raw.df %>%
select(citations, citedBy, title, filingDate) %>%
filter(filingDate >= as.Date(q1) & filingDate <= as.Date(q2))
# citation count per patent filed before 1994
citations1 <- raw.df %>%
select(citations, citedBy, title, filingDate) %>%
filter(filingDate <= as.Date(q1))# patent density -- which years most patents?
ggplot(raw.df, aes(x=filingDate)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666") +
ggtitle("Patent density by year") +
xlab("year") + ylab("density")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# citations and citedBy count over all years
c1 <- ggplot(raw.df, aes(x=filingDate, y=citedBy)) +
geom_point() +
ggtitle("(n) cited by other patents by year") +
xlab("year") + ylab("cited by")
c2 <- ggplot(raw.df, aes(x=filingDate, y=citations)) +
geom_point() +
ggtitle("(n) citations by year") +
xlab("year") + ylab("citations")
plot_grid(c1, c2, labels = c('A', 'B'), label_size = 12)# citations count by year quartiles (zoomed in)
p1 <- ggplot(citations1, aes(x=filingDate, y=citations)) +
geom_point() +
#stat_summary(fun.data=mean_cl_normal) +
#geom_smooth(method='lm', formula= y~x) +
ggtitle("(n) citation counts, 2006-2012") +
xlab("year") + ylab("citations")
p2 <- ggplot(citations2, aes(x=filingDate, y=citations)) +
geom_point() +
ggtitle("(n) citation counts, 2000-2006") +
xlab("year") + ylab("citations")
p3 <- ggplot(citations3, aes(x=filingDate, y=citations)) +
geom_point() +
ggtitle("(n) citation counts, 1994-2000") +
xlab("year") + ylab("citations")
p4 <- ggplot(citations4, aes(x=filingDate, y=citations)) +
geom_point() +
ggtitle("(n) citation counts, pre-1994") +
xlab("year") + ylab("citations")
plot_grid(p1, p2, p3, p4, labels = c('A', 'B', 'C', 'D'), label_size = 12)Examine the patent text files and view the text sections. We note 932 observations, 8 variables, and 2 integer classes and 6 character classes.
patents <- read.csv("data/patent_text.csv", header = TRUE, stringsAsFactors = FALSE)
glimpse(patents)## Observations: 932
## Variables: 8
## $ appId <chr> "US464750A", "US5585083A", "US5606014A", "US5617873A"…
## $ title <chr> "Button-hole-scissors gage - Google Patents", "Catal…
## $ abstract <chr> "Disclosed is a process for oxidizing formaldehyde to…
## $ origin <chr> "(No Model.) A. I. CAMPBELL. ", "The invention descri…
## $ background <chr> "No. 464,750. Patented Dec.8,1891. ", "This invention…
## $ citedBy <int> 4, 30, 20, 59, 140, 8, 47, 10, 19, 96, 17, 12, 39, 32…
## $ citations <int> 0, 7, 3, 17, 2, 1, 9, 1, 11, 8, 21, 16, 37, 13, 3, 11…
## $ filingDate <chr> "1891-12-08", "1995-03-30", "1995-08-04", "1994-08-25…
| appId | title | abstract | origin | background | citedBy | citations | filingDate |
|---|---|---|---|---|---|---|---|
| US464750A | Button-hole-scissors gage - Google Patents | Disclosed is a process for oxidizing formaldehyde to carbon dioxide and water without the addition of energy. A mixture of formaldehyde and an oxidizing agent (e.g., ambient air containing formaldehyde) is exposed to a catalyst which includes a noble metal dispersed on a metal oxide which possesses more than one oxidation state. Especially good results are obtained when the noble metal is platinum, and the metal oxide which possesses more than one oxidation state is tin oxide. A promoter (i.e., a small amount of an oxide of a transition series metal) may be used in association with the tin oxide to provide very beneficial results. | (No Model.) A. I. CAMPBELL. | No. 464,750. Patented Dec.8,1891. | 4 | 0 | 1891-12-08 |
| US5585083A | Catalytic process for formaldehyde oxidation | Controlled molecular weight imide oligomers and co-oligomers containing pendent phenylethynyl groups (PEPIs) and endcapped with nonreactive or phenylethynyl groups have been prepared by the cyclodehydration of the precursor amide acid oligomers or co-oligomers containing pendent phenylethynyl groups and endcapped with nonreactive or phenylethynyl groups. The amine terminated amide acid oligomers or co-oligomers are prepared from the reaction of dianhydride(s) with an excess of diamine(s) and diamine containing pendent phenylethynyl groups and subsequently endcapped with a phenylethynyl phthalic anhydride or monofunctional anhydride. The anhydride terminated amide acid oligomers and co-oligomers are prepared from the reaction of diamine(s) and diamine containing pendent phenylethynyl group(s) with an excess of dianhydride(s) and subsequently endcapped with a phenylethynyl amine or monofunctional amine. The polymerizations are carried out in polar aprotic solvents such as under nitrogen at room temperature. The amide acid oligomers or co-oligomers are subsequently cyclodehydrated to the corresponding imide oligomers. The polymers and copolymers prepared from these materials exhibit a unique and unexpected combination of properties. | The invention described herein was jointly made by employees of the United States Government, contract employees during the performance of work under a NASA contract which is subject to the provisions of Public Law 95-517 (35 USC 202) in which the contractor has elected not to retain title, and an employee of Rochester Gas and Electric Corporation during the performance of work under a Memorandum of Agreement. | This invention relates generally to oxidizing formaldehyde. It relates particularly to a process for oxidizing formaldehyde to carbon dioxide and water, which process includes exposing a gaseous mixture containing formaldehyde and an oxidizing agent to a catalyst of a noble metal dispersed on a metal oxide possessing more than one stable oxidation state. | 30 | 7 | 1995-03-30 |
Check to make sure we are not operating on empty data. We are not.
## [1] TRUE
Track our documents in case we need to make extra rows for each word
#patents <- patents %>%
#mutate(docId=as.integer(row.names(.)))
`%notin%` <- Negate(`%in%`)
if('docId' %notin% colnames(.)) patents<-add_column(patents, docId=as.integer(row.names(patents)), .before = 1)
head(patents, n=1)## docId appId title
## 1 1 US464750A Button-hole-scissors gage - Google Patents
## abstract
## 1 Disclosed is a process for oxidizing formaldehyde to carbon dioxide and water without the addition of energy. A mixture of formaldehyde and an oxidizing agent (e.g., ambient air containing formaldehyde) is exposed to a catalyst which includes a noble metal dispersed on a metal oxide which possesses more than one oxidation state. Especially good results are obtained when the noble metal is platinum, and the metal oxide which possesses more than one oxidation state is tin oxide. A promoter (i.e., a small amount of an oxide of a transition series metal) may be used in association with the tin oxide to provide very beneficial results.\n
## origin background citedBy
## 1 (No Model.) A. I. CAMPBELL. No. 464,750. Patented Dec.8,1891. 4
## citations filingDate
## 1 0 1891-12-08
References 1. https://github.com/tidyverse/dplyr/issues/2047 2. https://tibble.tidyverse.org/reference/add_column.html 3. https://www.r-bloggers.com/the-notin-operator/
## [1] "Button-hole-scissors gage - Google Patents"
patents <- patents %>%
#head(n=1) %>%
mutate(title = str_remove_all(title, " - Google Patents"))
head(patents$title, n=1)## [1] "Button-hole-scissors gage "
patents <- patents %>%
mutate(title=iconv(title, to='ASCII//TRANSLIT')) %>%
mutate(title=tolower(title)) %>%
mutate(title=str_replace_all(title,'\\w*\\d\\w*','')) %>%
mutate(title=str_replace_all(title,'\\b(apparatus|article|control|device|include.*|provide*|methods*|systems*)\\b','')) %>%
mutate(abstract=iconv(abstract, to='ASCII//TRANSLIT')) %>%
mutate(abstract=tolower(abstract)) %>%
mutate(abstract=str_replace_all(abstract,'\\w*\\d\\w*','')) %>%
mutate(abstract=str_replace_all(abstract,'\\b(apparatus|article|control|device|include.*|provide.*|methods*|systems*)\\b','')) %>%
mutate_if(is.character, str_squish)
head(patents, n=2)## docId appId title
## 1 1 US464750A button-hole-scissors gage
## 2 2 US5585083A catalytic process for formaldehyde oxidation
## abstract
## 1 disclosed is a process for oxidizing formaldehyde to carbon dioxide and water without the addition of energy. a mixture of formaldehyde and an oxidizing agent (e.g., ambient air containing formaldehyde) is exposed to a catalyst which .
## 2 controlled molecular weight imide oligomers and co-oligomers containing pendent phenylethynyl groups (pepis) and endcapped with nonreactive or phenylethynyl groups have been prepared by the cyclodehydration of the precursor amide acid oligomers or co-oligomers containing pendent phenylethynyl groups and endcapped with nonreactive or phenylethynyl groups. the amine terminated amide acid oligomers or co-oligomers are prepared from the reaction of dianhydride(s) with an excess of diamine(s) and diamine containing pendent phenylethynyl groups and subsequently endcapped with a phenylethynyl phthalic anhydride or monofunctional anhydride. the anhydride terminated amide acid oligomers and co-oligomers are prepared from the reaction of diamine(s) and diamine containing pendent phenylethynyl group(s) with an excess of dianhydride(s) and subsequently endcapped with a phenylethynyl amine or monofunctional amine. the polymerizations are carried out in polar aprotic solvents such as under nitrogen at room temperature. the amide acid oligomers or co-oligomers are subsequently cyclodehydrated to the corresponding imide oligomers. the polymers and copolymers prepared from these materials exhibit a unique and unexpected combination of properties.
## origin
## 1 (No Model.) A. I. CAMPBELL.
## 2 The invention described herein was jointly made by employees of the United States Government, contract employees during the performance of work under a NASA contract which is subject to the provisions of Public Law 95-517 (35 USC 202) in which the contractor has elected not to retain title, and an employee of Rochester Gas and Electric Corporation during the performance of work under a Memorandum of Agreement.
## background
## 1 No. 464,750. Patented Dec.8,1891.
## 2 This invention relates generally to oxidizing formaldehyde. It relates particularly to a process for oxidizing formaldehyde to carbon dioxide and water, which process includes exposing a gaseous mixture containing formaldehyde and an oxidizing agent to a catalyst of a noble metal dispersed on a metal oxide possessing more than one stable oxidation state.
## citedBy citations filingDate
## 1 4 0 1891-12-08
## 2 30 7 1995-03-30
patents_stats <- patents %>%
rowwise() %>%
transmute(title = title, title_nchar = nchar(title), title_nmeans= nchar(title),
abstract=abstract, abstract_nchar = nchar(abstract), abstract_nmeans= nchar(abstract),
origin = origin, origin_nchar = nchar(origin), origin_nmeans= nchar(origin),
background = background, background_nchar = nchar(background), background_nmeans= nchar(background)) %>%
mutate(title_nmeans=colMeans(.[3]),
abs_nmeans=colMeans(.[6]),
origin_nmeans=colMeans(.[9]),
background_nmeans=colMeans(.[12])) %>%
cbind(select(patents, -c(title, abstract, origin, background)),.)
head(patents_stats, n=2)## docId appId citedBy citations filingDate
## 1 1 US464750A 4 0 1891-12-08
## 2 2 US5585083A 30 7 1995-03-30
## title title_nchar title_nmeans
## 1 button-hole-scissors gage 25 NA
## 2 catalytic process for formaldehyde oxidation 44 NA
## abstract
## 1 disclosed is a process for oxidizing formaldehyde to carbon dioxide and water without the addition of energy. a mixture of formaldehyde and an oxidizing agent (e.g., ambient air containing formaldehyde) is exposed to a catalyst which .
## 2 controlled molecular weight imide oligomers and co-oligomers containing pendent phenylethynyl groups (pepis) and endcapped with nonreactive or phenylethynyl groups have been prepared by the cyclodehydration of the precursor amide acid oligomers or co-oligomers containing pendent phenylethynyl groups and endcapped with nonreactive or phenylethynyl groups. the amine terminated amide acid oligomers or co-oligomers are prepared from the reaction of dianhydride(s) with an excess of diamine(s) and diamine containing pendent phenylethynyl groups and subsequently endcapped with a phenylethynyl phthalic anhydride or monofunctional anhydride. the anhydride terminated amide acid oligomers and co-oligomers are prepared from the reaction of diamine(s) and diamine containing pendent phenylethynyl group(s) with an excess of dianhydride(s) and subsequently endcapped with a phenylethynyl amine or monofunctional amine. the polymerizations are carried out in polar aprotic solvents such as under nitrogen at room temperature. the amide acid oligomers or co-oligomers are subsequently cyclodehydrated to the corresponding imide oligomers. the polymers and copolymers prepared from these materials exhibit a unique and unexpected combination of properties.
## abstract_nchar abstract_nmeans
## 1 235 235
## 2 1249 1249
## origin
## 1 (No Model.) A. I. CAMPBELL.
## 2 The invention described herein was jointly made by employees of the United States Government, contract employees during the performance of work under a NASA contract which is subject to the provisions of Public Law 95-517 (35 USC 202) in which the contractor has elected not to retain title, and an employee of Rochester Gas and Electric Corporation during the performance of work under a Memorandum of Agreement.
## origin_nchar origin_nmeans
## 1 27 242.8637
## 2 413 242.8637
## background
## 1 No. 464,750. Patented Dec.8,1891.
## 2 This invention relates generally to oxidizing formaldehyde. It relates particularly to a process for oxidizing formaldehyde to carbon dioxide and water, which process includes exposing a gaseous mixture containing formaldehyde and an oxidizing agent to a catalyst of a noble metal dispersed on a metal oxide possessing more than one stable oxidation state.
## background_nchar background_nmeans abs_nmeans
## 1 33 368.5944 NA
## 2 356 368.5944 NA
Store the dates in date format yyyy-mm-dd.
patents_stats <- patents_stats %>%
rowwise() %>%
mutate(filingDate=ymd(filingDate))
head(patents_stats, n=1)## Source: local data frame [1 x 18]
## Groups: <by row>
##
## # A tibble: 1 x 18
## docId appId citedBy citations filingDate title title_nchar title_nmeans
## <int> <chr> <int> <int> <date> <chr> <int> <dbl>
## 1 1 US46… 4 0 1891-12-08 butt… 25 NA
## # … with 10 more variables: abstract <chr>, abstract_nchar <int>,
## # abstract_nmeans <int>, origin <chr>, origin_nchar <int>,
## # origin_nmeans <dbl>, background <chr>, background_nchar <int>,
## # background_nmeans <dbl>, abs_nmeans <dbl>
References: 1. https://dplyr.tidyverse.org/reference/mutate.html#grouped-tibbles 2. https://dplyr.tidyverse.org/reference/summarise.html 3. https://github.com/tidyverse/dplyr/issues/2838 4. https://stackoverflow.com/questions/43897844/r-move-column-to-last-using-dplyr
After looking into the patent stats, and transforming the data, we choose title and abstract for our Data Analysis.
title_words <- tb_titles %>%
count(word, sort = TRUE)
title_words %>%
top_n(10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill=word)) +
geom_col() +
xlab(NULL) +
coord_flip() +
ylab("Word Frequency") +
ggtitle("Most Common Title Words") ## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 carbon 44
## 2 sensor 39
## 3 based 35
## 4 composite 29
## 5 sensing 29
title_word_counts <- tb_titles %>%
count(docId, word, sort = TRUE)%>%
ungroup()
head(title_word_counts, 5)## # A tibble: 5 x 3
## docId word n
## <int> <chr> <int>
## 1 203 ester 3
## 2 409 ester 3
## 3 711 flow 3
## 4 3 oligomers 2
## 5 4 pressure 2
abstract_words <- tb_abstracts %>%
count(word, sort = TRUE)
abstract_words %>%
top_n(10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill=word)) +
geom_col() +
xlab(NULL) +
coord_flip() +
ylab("Word Frequency") +
ggtitle("Most Common Abstract Words") ## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 material 168
## 2 surface 152
## 3 invention 148
## 4 signal 138
## 5 plurality 128
abstract_word_counts <- tb_abstracts %>%
count(docId, word, sort = TRUE)%>%
ungroup()
head(abstract_word_counts, 5)## # A tibble: 5 x 3
## docId word n
## <int> <chr> <int>
## 1 702 image 19
## 2 553 optical 17
## 3 787 flight 16
## 4 879 beam 16
## 5 879 sample 13
References: 1. https://stackoverflow.com/questions/20495598/replace-accented-characters-in-r-with-non-accented-counterpart-utf-8-encoding 2. https://www.tidytextmining.com/tidytext.html
## <<DocumentTermMatrix (documents: 932, terms: 1973)>>
## Non-/sparse entries: 5003/1833833
## Sparsity : 100%
## Maximal term length: NA
## Weighting : term frequency (tf)
## <<DocumentTermMatrix (documents: 878, terms: 5037)>>
## Non-/sparse entries: 18996/4403490
## Sparsity : 100%
## Maximal term length: NA
## Weighting : term frequency (tf)
# Disable to avoid unintentional overwrites
abstract_lda <- LDA(abstract_dtm, k = 24, control = list(seed = 1234))
abstract_lda## A LDA_VEM topic model with 24 topics.
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2578809 137.8 4197022 224.2 NA 4197022 224.2
## Vcells 5174493 39.5 10146329 77.5 32768 7712659 58.9
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2578700 137.8 4197022 224.2 NA 4197022 224.2
## Vcells 5174320 39.5 10146329 77.5 32768 7712659 58.9
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2578708 137.8 4197022 224.2 NA 4197022 224.2
## Vcells 5174351 39.5 10146329 77.5 32768 7712659 58.9
tidy_abstract_lda <- tidy(abstract_lda)
top_abstract_terms <- tidy_abstract_lda %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)top_abstract_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_x_reordered() +
labs(title = "Top 10 terms in each LDA topic",
x = NULL, y = expression(beta)) +
facet_wrap(~ topic, ncol = 4, scales = "free")Our LDA shows an even distribution of probability across many documents.
lda_gamma <- tidy(abstract_lda, matrix = "gamma")
ggplot(lda_gamma, aes(gamma)) +
geom_histogram() +
scale_y_log10() +
labs(title = "Distribution of probabilities for all topics",
y = "Number of documents", x = expression(gamma))ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
geom_histogram(show.legend = FALSE) +
facet_wrap(~ topic, ncol = 4) +
scale_y_log10() +
labs(title = "Distribution of probability for each topic",
y = "Number of documents", x = expression(gamma))Trying another strategy, we can boost the number of topics and then extract the top term per document. Other strategies include creating bi- or tri-grams or combining TF-IDF and other Keywords such as those available in tidytextmining from NASA with the LDA topics or associated words.
Here we try to increase the topics to 100 (roughly 1/8 of the documents).
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2598373 138.8 4197022 224.2 NA 4197022 224.2
## Vcells 5559068 42.5 12255594 93.6 32768 8690708 66.4
## A LDA_VEM topic model with 100 topics.
topics <- tidy(abstract_lda, matrix = "beta")
topics %>%
mutate(term = reorder(term, beta)) %>%
# Need to limit number of terms
group_by(term) %>%
top_n(3, beta) %>%
ungroup() %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
labs(x = NULL, y = "Beta") +
coord_flip() abs_topics_term <- topics %>%
group_by(topic) %>%
filter(beta == max(beta)) %>%
slice(1) %>%
ungroup()
head(abs_topics_term,n=5)## # A tibble: 5 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 conductive 0.0507
## 2 2 material 0.0462
## 3 3 inflatable 0.0591
## 4 4 signal 0.0450
## 5 5 single 0.0246
Merge and get one topic per document
abs_topic_docs <- tidy(abstract_lda, matrix = "gamma")
topics_ <- abs_topic_docs %>%
mutate(docId=as.integer(document)) %>%
select(., -document) %>%
group_by(docId) %>%
filter(gamma == max(gamma)) %>%
slice(1) %>%
ungroup()
abs_topic_docs## # A tibble: 87,800 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 702 1 0.0000718
## 2 553 1 0.0000539
## 3 787 1 0.0000744
## 4 879 1 0.0000642
## 5 261 1 0.000122
## 6 278 1 0.0000521
## 7 279 1 0.0000649
## 8 432 1 0.0000940
## 9 2 1 0.0000678
## 10 7 1 0.0000642
## # … with 87,790 more rows
## # A tibble: 878 x 3
## topic gamma docId
## <int> <dbl> <int>
## 1 43 0.969 1
## 2 62 0.993 2
## 3 75 0.993 3
## 4 11 0.556 4
## 5 19 0.599 5
## 6 64 0.989 6
## 7 62 0.994 7
## 8 37 0.714 8
## 9 72 0.995 9
## 10 41 0.850 10
## # … with 868 more rows
Get one term per topic
## Warning in topic == topics_$docId: longer object length is not a multiple
## of shorter object length
## Warning: Length of logical index must be 1 or 100, not 878
topic_terms <- inner_join(topics_, terms_, by = "topic") # join subset and df of topic per doc by "topic"
terms_## # A tibble: 12 x 2
## topic term
## <int> <chr>
## 1 1 conductive
## 2 2 material
## 3 3 inflatable
## 4 4 signal
## 5 5 single
## 6 6 layer
## 7 7 connecting
## 8 8 rotor
## 9 9 band
## 10 10 drill
## 11 11 ratio
## 12 12 data
## # A tibble: 87 x 4
## topic gamma docId term
## <int> <dbl> <int> <chr>
## 1 11 0.556 4 ratio
## 2 6 0.994 16 layer
## 3 11 0.989 24 ratio
## 4 3 0.989 56 inflatable
## 5 3 0.790 67 inflatable
## 6 2 0.988 81 material
## 7 3 0.989 100 inflatable
## 8 9 0.990 102 band
## 9 1 0.993 109 conductive
## 10 4 0.472 121 signal
## # … with 77 more rows
Check for errors
## [1] FALSE
Merge to patent document df
patent_topics <- merge(patents, topic_terms, by = "docId") %>%
group_by(docId) %>% arrange(., docId)
head(patent_topics)## # A tibble: 6 x 12
## # Groups: docId [6]
## docId appId title abstract origin background citedBy citations filingDate
## <int> <chr> <chr> <chr> <chr> <chr> <int> <int> <chr>
## 1 4 US56… non-… a for f… The i… 1. Field … 59 17 1994-08-25
## 2 16 US57… diff… an and … The i… Periodont… 27 11 1996-09-09
## 3 24 US58… quan… an auto… The i… "Tumor ce… 8 7 1993-07-27
## 4 56 US62… endo… an infl… This … The inven… 27 9 1996-04-17
## 5 67 US63… and … a porta… This … 1. Techni… 46 2 1999-02-02
## 6 81 US64… high… cartesi… The i… The prese… 17 5 2000-08-22
## # … with 3 more variables: topic <int>, gamma <dbl>, term <chr>
Extract
Reference: 1. https://www.tidytextmining.com/tidytext.html 2. https://www.kaggle.com/rtatman/nlp-in-r-topic-modelling#Unsupervised-topic-modeling-with-LDA
Measures the importance of a word within the corpus, potentially surfacing not unique words to the document (tf) but those that are not very frequent in the corpus (IDF). We investigate this potentially more reliable method (given the small dataset (corpus)) until we are able to expand or augment the LDA topic analysis and then test it.
abstracts_tf_idf <- tb_abstracts %>%
count(docId, word, sort = TRUE) %>%
ungroup() %>%
bind_tf_idf(word, docId, n)
abstracts_tf_idf %>%
top_n(10) %>%
ggplot(aes(word, tf_idf, fill=word)) +
geom_col() +
xlab(NULL) +
coord_flip() +
ylab("Word TF-IDF") +
ggtitle("Sample of Most Relevant, Unique Abstract Words in Corpus") doc_abs_tf_idf <- abstracts_tf_idf %>%
group_by(docId) %>%
filter(tf_idf == max(tf_idf)) %>%
slice(1) %>%
ungroup()
head(doc_abs_tf_idf,n=5)## # A tibble: 5 x 6
## docId word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 1 formaldehyde 3 0.158 6.08 0.961
## 2 2 oligomers 11 0.124 5.17 0.639
## 3 3 icp 7 0.0864 5.39 0.466
## 4 4 ferroelectric 1 0.333 5.39 1.80
## 5 5 quasi 1 0.25 5.68 1.42
Merge
doc_terms <- doc_abs_tf_idf %>% transmute(docId=docId, term=word)
patents_terms <- merge(doc_terms, patents, by='docId')
kable(head(patents_terms, n=2))| docId | term | appId | title | abstract | origin | background | citedBy | citations | filingDate |
|---|---|---|---|---|---|---|---|---|---|
| 1 | formaldehyde | US464750A | button-hole-scissors gage | disclosed is a process for oxidizing formaldehyde to carbon dioxide and water without the addition of energy. a mixture of formaldehyde and an oxidizing agent (e.g., ambient air containing formaldehyde) is exposed to a catalyst which . | (No Model.) A. I. CAMPBELL. | No. 464,750. Patented Dec.8,1891. | 4 | 0 | 1891-12-08 |
| 2 | oligomers | US5585083A | catalytic process for formaldehyde oxidation | controlled molecular weight imide oligomers and co-oligomers containing pendent phenylethynyl groups (pepis) and endcapped with nonreactive or phenylethynyl groups have been prepared by the cyclodehydration of the precursor amide acid oligomers or co-oligomers containing pendent phenylethynyl groups and endcapped with nonreactive or phenylethynyl groups. the amine terminated amide acid oligomers or co-oligomers are prepared from the reaction of dianhydride(s) with an excess of diamine(s) and diamine containing pendent phenylethynyl groups and subsequently endcapped with a phenylethynyl phthalic anhydride or monofunctional anhydride. the anhydride terminated amide acid oligomers and co-oligomers are prepared from the reaction of diamine(s) and diamine containing pendent phenylethynyl group(s) with an excess of dianhydride(s) and subsequently endcapped with a phenylethynyl amine or monofunctional amine. the polymerizations are carried out in polar aprotic solvents such as under nitrogen at room temperature. the amide acid oligomers or co-oligomers are subsequently cyclodehydrated to the corresponding imide oligomers. the polymers and copolymers prepared from these materials exhibit a unique and unexpected combination of properties. | The invention described herein was jointly made by employees of the United States Government, contract employees during the performance of work under a NASA contract which is subject to the provisions of Public Law 95-517 (35 USC 202) in which the contractor has elected not to retain title, and an employee of Rochester Gas and Electric Corporation during the performance of work under a Memorandum of Agreement. | This invention relates generally to oxidizing formaldehyde. It relates particularly to a process for oxidizing formaldehyde to carbon dioxide and water, which process includes exposing a gaseous mixture containing formaldehyde and an oxidizing agent to a catalyst of a noble metal dispersed on a metal oxide possessing more than one stable oxidation state. | 30 | 7 | 1995-03-30 |
Extract
Reference: 1. https://www.tidytextmining.com/tidytext.html
The code chunk for the USPTO acquistion shall be used when documents beyond the Google archive are needed.
# Example https://developer.uspto.gov/ibd-api/v1/patent/application?applicationNumber=US12795356'
# Requires nasa_patents objects
for (app in nasa_patents$Application_SN_uspto) {
#get_patent_application_uspto(app)
url <- paste0('https://developer.uspto.gov/ibd-api/v1/patent/application?applicationNumber=',app)
path <- "data/uspto"
file_path <- file.path(path, str_c(app, ".html"))
raw.df <- read.csv("data/patent_text.csv", as.is = TRUE)
raw.df$filingDate <- as.Date(raw.df$filingDate, format = "%Y-%m-%d")
}We didn’t use the USPTO source. We acquired patent data from Google, instead. This code chunk for this acquistion is disabled because I don’t understand what it retrieves, but I’ll keep the code in case I can use it later.
# Curl
# curl -X GET --header 'Accept: application/json' 'https://developer.uspto.gov/ibd-api/v1/patent/application?applicationNumber=US14202699%2CUS12795356&start=0&rows=100'
# Request URL
# https://developer.uspto.gov/ibd-api/v1/patent/application?applicationNumber=US14202699%2CUS12795356&start=0&rows=100
if (!file.exists(file_path)) {
patent_application <- jsonlite::fromJSON(url)
#test
print(patent_application)
# Bulk download not ideal, disable download from USPTO
##download.file(patent_application$response$docs$archiveUrl, file.path(path))
# Instead use the following form to query for new HTML files
# http://patft.uspto.gov/netahtml/PTO/index.html
# Example: http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=5963949.PN.&OS=PN/5963949&RS=PN/5963949
break
}
}
print("Downloads complete.")## [1] "app_country" "app_date" "app_number"
## [4] "app_type" "appcit_app_number" "appcit_category"
result <- search_pv(
query = qry_funs$eq(patent_number = "8293178"),
fields = get_fields(endpoint = "patents", groups = c("patents", "inventors"))
)
result## $data
## #### A list with a single data frame (with list column(s) inside) on a patent level:
##
## List of 1
## $ patents:'data.frame': 1 obs. of 32 variables:
## ..$ detail_desc_length : chr "21556"
## ..$ patent_abstract : chr "A chemochromic sensor"..
## ..$ patent_average_processing_time : chr "1590"
## ..$ patent_date : chr "2012-10-23"
## ..$ patent_firstnamed_assignee_city : chr "Washington"
## ..$ patent_firstnamed_assignee_country : chr "US"
## ..$ patent_firstnamed_assignee_id : chr "org_EolmLkaBf9MsnLD1f"..
## ..$ patent_firstnamed_assignee_latitude : chr "38.895"
## ..$ patent_firstnamed_assignee_location_id: chr "38.895|-77.0367"
## ..$ patent_firstnamed_assignee_longitude : chr "-77.0367"
## ..$ patent_firstnamed_assignee_state : chr "DC"
## ..$ patent_firstnamed_inventor_city : chr "Titusville"
## ..$ patent_firstnamed_inventor_country : chr "US"
## ..$ patent_firstnamed_inventor_id : chr "7790787-4"
## ..$ patent_firstnamed_inventor_latitude : chr "28.6119"
## ..$ patent_firstnamed_inventor_location_id: chr "28.6119|-80.8078"
## ..$ patent_firstnamed_inventor_longitude : chr "-80.8078"
## ..$ patent_firstnamed_inventor_state : chr "FL"
## ..$ patent_id : chr "8293178"
## ..$ patent_kind : chr "B2"
## ..$ patent_num_cited_by_us_patents : chr "0"
## ..$ patent_num_claims : chr "14"
## ..$ patent_num_combined_citations : chr "10"
## ..$ patent_num_foreign_citations : chr "0"
## ..$ patent_num_us_application_citations : chr "5"
## ..$ patent_num_us_patent_citations : chr "5"
## ..$ patent_number : chr "8293178"
## ..$ patent_processing_time : chr "1813"
## ..$ patent_title : chr "Chemochromic detector"..
## ..$ patent_type : chr "utility"
## ..$ patent_year : chr "2012"
## ..$ inventors :List of 1
##
## $query_results
## #### Distinct entity counts across all downloadable pages of output:
##
## total_patent_count = 1
## [1] "A chemochromic sensor for detecting a combustible gas, such as hydrogen, includes a chemochromic pigment mechanically mixed with a polymer and formed into a rigid or pliable material. In a preferred embodiment, the chemochromic detector includes aerogel material. The detector is robust and easily modifiable for a variety of applications and environmental conditions, such as atmospheres of inert gas, hydrogen gas, or mixtures of gases, or in environments that have variable temperature, including high temperatures such as above 100° C. and low temperatures such as below −196° C."
Other attributes explored
# prior art keywords
raw.html %>% html_nodes("[itemprop='priorArtKeywords']") %>% html_text()
# inventors
raw.html %>% html_nodes("[itemprop='inventor']") %>% html_text()
# current assignee
raw.html %>% html_nodes("[itemprop='assigneeCurrent']") %>% html_text()
# original assignee
#raw.html %>% html_nodes("[itemprop='assigneeOriginal']") %>% html_text() # lots of noise; need to parse later
# filing date
raw.html %>% html_nodes("[itemprop='filingDate']") %>% html_text() # which one to select?
# classifications
raw.html %>% html_nodes("[itemprop='cpcs']") %>% html_text() # what does it mean?Explore PLOS. Commentary in code chunk. Unhide to review.
## field description
## 10 abstract Abstract section
## 49 abstract_ngram <NA>
## 11 abstract_primary_display Abstract section
## 21 accepted_date Accepted Date
## 38 affiliate Affiliate
## 50 affiliate_facet <NA>
## 5 alternate_title Alternative Title
## 25 article_type Article Type
## 51 article_type_facet <NA>
## 6 author Author
## 52 author_affiliate <NA>
## 9 author_collab_only_display Author
## 7 author_display Author
## 53 author_facet <NA>
## 39 author_notes Author Notes
## 8 author_without_collab_display Author
## 18 body Most sections of the article
## 54 body_ngram <NA>
## 55 body_rev <NA>
## 40 competing_interest Competing Interest Statement
## 15 conclusions Conclusions section
## 45 copyright copyright-statement
## 42 counter_total_all Total views, all time
## 43 counter_total_month Total views, last 30 days
## 48 cross_published_journal_eissn no description
## 47 cross_published_journal_key Cross Published Journal Key
## 46 cross_published_journal_name Cross Published Journal Name
## 56 doc_partial_body <NA>
## 57 doc_partial_parent_id <NA>
## 58 doc_partial_type <NA>
## 59 doc_type <NA>
## 36 editor Editor
## 60 editor_affiliate <NA>
## 37 editor_display Editor
## 61 editor_facet <NA>
## 28 eissn electronic ISSN
## 30 elocation_id Electronic Location
## 2 everything All text in the article
## 62 everything_ngram <NA>
## 63 everything_noprocess <NA>
## 64 everything_rev <NA>
## 65 figure_table_caption <NA>
## 41 financial_disclosure Financial Disclosure Statement
## 1 id DOI (Digital Object Identifier)
## 12 introduction Introduction section
## 24 issue Issue
## 22 journal Full Journal Name
## 32 journal_id_nlm_ta Journal ID at NLM
## 31 journal_id_pmc Journal ID at PMC
## 33 journal_id_publisher Publisher of this Journal
## 13 materials_and_methods Materials and Methods section
## 35 pagecount Total number of pages
## 29 pissn print ISSN
## 19 publication_date Publication Date
## 34 publisher Publisher of this Article
## 20 received_date Received Date
## 17 reference Reference section
## 14 results_and_discussion Results and discussion section
## 26 subject Subject Category
## 66 subject_facet <NA>
## 67 subject_hierarchy <NA>
## 27 subject_level_1 Subject Category
## 68 subject2 <NA>
## 69 subject2_facet <NA>
## 70 subject2_hierarchy <NA>
## 71 subject2_level_1 <NA>
## 16 supporting_information Supporting Information section
## 44 timestamp Time of last index
## 3 title Article Title
## 4 title_display Article Title
## 72 title_ngram <NA>
## 23 volume Volume
## note
## 10 no note
## 49 <NA>
## 11 For display purposes only. Primary abstract only
## 21 Requires start and end date
## 38 Can have multiple values
## 50 <NA>
## 5 no note
## 25 no note
## 51 <NA>
## 6 Can have multiple values
## 52 <NA>
## 9 For display purposes only. Collaborative authors only
## 7 For display purposes only
## 53 <NA>
## 39 no note
## 8 For display purposes only. All the authors except for collaborative authors
## 18 Without Abstract or References
## 54 <NA>
## 55 <NA>
## 40 no note
## 15 no note
## 45 copyright information
## 42 no note
## 43 no note
## 48 PLoS-specific indexes for articles that appear in multiple journals
## 47 PLoS-specific indexes for articles that appear in multiple journals
## 46 PLoS-specific indexes for articles that appear in multiple journals
## 56 <NA>
## 57 <NA>
## 58 <NA>
## 59 <NA>
## 36 Can have multiple values
## 60 <NA>
## 37 For display purposes only.
## 61 <NA>
## 28 no note
## 30 Used by Pub Med Central
## 2 Includes Meta information
## 62 <NA>
## 63 <NA>
## 64 <NA>
## 65 <NA>
## 41 no note
## 1 Extended for partial documents
## 12 no note
## 24 no note
## 22 no note
## 32 Used by the National Library of Medicine
## 31 Used by Pub Med Central
## 33 Short identifier
## 13 no note
## 35 Not all articles have page count
## 29 no note
## 19 Requires start and end date
## 34 Full name
## 20 Requires start and end date
## 17 Can have multiple values
## 14 no note
## 26 Can have multiple values
## 66 <NA>
## 67 <NA>
## 27 Can have multiple values. Contains only the top level subjects.
## 68 <NA>
## 69 <NA>
## 70 <NA>
## 71 <NA>
## 16 no note
## 44 no note
## 3 no note
## 4 For display purposes only
## 72 <NA>
## 23 no note
## [1] "PLoSONE" "PLoSGenetics" "PLoSPathogens"
## [4] "PLoSNTD" "PLoSCompBiol" "PLoSBiology"
## [7] "PLoSMedicine" "PLoSClinicalTrials"
# Search for an author. Return the research paper ID and title.
searchplos(q = "author:sulzer", fl = "id, title")## $meta
## # A tibble: 1 x 2
## numFound start
## <int> <int>
## 1 30 0
##
## $data
## # A tibble: 10 x 2
## id title
## <chr> <chr>
## 1 10.1371/journal.pcbi.1005681 Self-regulation strategy, feedback timin…
## 2 10.1371/journal.pcbi.1005681/… <NA>
## 3 10.1371/journal.pcbi.1005681/… <NA>
## 4 10.1371/journal.pcbi.1005681/… <NA>
## 5 10.1371/journal.pcbi.1005681/… <NA>
## 6 10.1371/journal.pcbi.1005681/… <NA>
## 7 10.1371/journal.pcbi.1005681/… <NA>
## 8 10.1371/journal.pcbi.1005681/… <NA>
## 9 10.1371/journal.pone.0053040 Macroautophagy Abnormality in Essential …
## 10 10.1371/journal.pone.0053040/… <NA>
# Query on a keyword. Return ID, date, title, and abstract.
p <- searchplos(q = "synuclein",
fl = c("id", "publication_date", "title", "abstract"),
limit = 20)
# The number of hits.
p$meta## # A tibble: 1 x 2
## numFound start
## <int> <int>
## 1 1967 0
## # A tibble: 20 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2013-04-22T00:00… "\nAbnormal α-synucle… Molecular Ageing o…
## 2 10.1371/jo… 2014-02-25T00:00… "\nα-Synuclein is the… Differential Expre…
## 3 10.1371/jo… 2011-01-31T00:00… "\n Genetic an… Resistance to MPTP…
## 4 10.1371/jo… 2012-12-31T00:00… "\n α-Synuclei… p62/SQSTM1-Depende…
## 5 10.1371/jo… 2013-04-25T00:00… "α-synuclein dysregul… Alpha-Synuclein In…
## 6 10.1371/jo… 2012-08-08T00:00… "\n Phospholip… γ-Synuclein Intera…
## 7 10.1371/jo… 2011-07-14T00:00… "\n Genetic, b… Assessment of α-Sy…
## 8 10.1371/jo… 2012-12-17T00:00… "\n α-synuclei… α-Synuclein and An…
## 9 10.1371/jo… 2013-02-20T00:00… "\n Cigarette … Human α4β2 Nicotin…
## 10 10.1371/jo… 2010-05-05T00:00… "Background: Melanoma… Parkinson's Diseas…
## 11 10.1371/jo… 2013-01-22T00:00… "\n Amyloid fi… Temperature-Depend…
## 12 10.1371/jo… 2011-12-07T00:00… "\n Alpha-synu… Redistribution of …
## 13 10.1371/jo… 2014-07-07T00:00… "\nSynucleinopathies,… Novel AAV-Based Ra…
## 14 10.1371/jo… 2015-04-06T00:00… "\nThere is unequivoc… Alpha-Synuclein Le…
## 15 10.1371/jo… 2013-05-07T00:00… "\nWhile most forms o… Impairment of Mito…
## 16 10.1371/jo… 2011-10-31T00:00… "\n Recent res… Antibodies against…
## 17 10.1371/jo… 2010-08-11T00:00… "\nThe protein α-synu… α-Synuclein Suppre…
## 18 10.1371/jo… 2009-08-14T00:00… "\nIn synucleinopathi… Parkin Deficiency …
## 19 10.1371/jo… 2017-02-10T00:00… "\nα-Synuclein misfol… α-Synuclein increa…
## 20 10.1371/jo… 2012-04-27T00:00… "\n α-Synuclei… Role of Alpha-Synu…
p <- searchplos(q = "body:phenylethynyl",
fl = c("id", "publication_date", "title", "abstract"))
# The number of hits.
p$meta## # A tibble: 1 x 2
## numFound start
## <int> <int>
## 1 40 0
## # A tibble: 10 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2014-05-23T00:00… "\nWe studied pattern… Structural Probing…
## 2 10.1371/jo… 2014-12-30T00:00… "\nSquare wave voltam… Square Wave Voltam…
## 3 10.1371/jo… 2016-10-17T00:00… "\nPenicillin binding… Oxazin-5-Ones as a…
## 4 10.1371/jo… 2011-06-17T00:00… "\n Highly sel… Design, Synthesis …
## 5 10.1371/jo… 2007-08-22T00:00… Optimization of a ser… Computer-Aided Lea…
## 6 10.1371/jo… 2012-10-23T00:00… "\n Drug toxic… In Situ Mass Spect…
## 7 10.1371/jo… 2014-07-25T00:00… "\nAntagonists of met… The mGluR5 Antagon…
## 8 10.1371/jo… 2008-05-14T00:00… Hippocampal synaptic … MGluR5 Mediates th…
## 9 10.1371/jo… 2017-11-27T00:00… "\nPrion infections c… Inhibition of grou…
## 10 10.1371/jo… 2019-08-21T00:00… "\nInhibitory glycine… mGluR5/ERK signali…
p <- searchplos(q = "body:liquid supersonic cleaning",
fl = c("id", "publication_date", "title", "abstract"))
# The number of hits.
p$meta## # A tibble: 1 x 2
## numFound start
## <int> <int>
## 1 13 0
## # A tibble: 10 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2013-07-16T00:00… "\nWe report on the s… Nanoscale Roughnes…
## 2 10.1371/jo… 2018-06-01T00:00… "\nSugarcane bagasse … A comparative stud…
## 3 10.1371/jo… 2013-11-11T00:00… "\nThe development of… Comparison of Nume…
## 4 10.1371/jo… 2017-04-11T00:00… "\nThe considerable m… Hydrophobic pinnin…
## 5 10.1371/jo… 2014-01-21T00:00… "\nExenatide is an FD… Oral Delivery of E…
## 6 10.1371/jo… 2019-10-24T00:00… "\nArraying individua… Extracellular vesi…
## 7 10.1371/jo… 2016-10-27T00:00… "\nThe Cu-Li-Sn phase… The Cu-Li-Sn Phase…
## 8 10.1371/jo… 2017-06-08T00:00… "\nThe recent episode… Application of aco…
## 9 10.1371/jo… 2012-12-05T00:00… "Objective: Chronic r… Macrophages Facili…
## 10 10.1371/jo… 2012-02-20T00:00… "\n There has … Quantitative Model…
Search for papers whose bodies contain the top words from NASA patent titles.
## Selecting by n
top_title_words <- top_title_words$word
for (word in top_title_words) {
q <- paste0("body:", word)
p <- searchplos(q = q,
fl = c("id", "publication_date", "title", "abstract"))
print(paste0("Search term: ", word))
print(paste0("Count: ", p$meta$numFound))
print(paste0("Sample of documents found..."))
print(head(p$data), n = 5)
}## [1] "Search term: carbon"
## [1] "Count: 29619"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2013-08-26T00:00… "\nChina has been exp… Organic Carbon Stor…
## 2 10.1371/jo… 2013-01-10T00:00… "\n Phenotypic… Coevolution Trumps …
## 3 10.1371/jo… 2012-09-14T00:00… "\n Monitoring… Towards Regional, E…
## 4 10.1371/jo… 2015-03-16T00:00… "\nSoil type and fert… Dynamics of Maize C…
## 5 10.1371/jo… 2016-08-05T00:00… "\nThe alpine grassla… Ecosystem Carbon St…
## # … with 1 more row
## [1] "Search term: sensor"
## [1] "Count: 16289"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2018-08-29T00:00… "\nBuilding predictiv… Infinitely large, r…
## 2 10.1371/jo… 2015-10-23T00:00… "\nMagnetic biosensor… Configurational Sta…
## 3 10.1371/jo… 2018-10-09T00:00… "Background: In diabe… Effect of sensor lo…
## 4 10.1371/jo… 2015-05-07T00:00… "\nDetecting spreadin… Detecting the Influ…
## 5 10.1371/jo… 2014-03-04T00:00… "\nWe address the pro… Feature Selection f…
## # … with 1 more row
## [1] "Search term: based"
## [1] "Count: 229327"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2010-05-10T00:00… "Background: During t… Transcription-Assoc…
## 2 10.1371/jo… 2016-11-02T00:00… "\nLander-Waterman’s … Breaking Lander-Wat…
## 3 10.1371/jo… 2015-12-07T00:00… "\nThe introduction o… Introducing Compute…
## 4 10.1371/jo… 2017-10-27T00:00… "\nThe challenge of d… Hybrid self-optimiz…
## 5 10.1371/jo… 2018-01-31T00:00… "Background: The last… Historical trends i…
## # … with 1 more row
## [1] "Search term: composite"
## [1] "Count: 60646"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2017-01-20T00:00… "\nMussel-inspired ap… Thermal Conductivit…
## 2 10.1371/jo… 2018-04-13T00:00… "Objective: To study … Preparation and cha…
## 3 10.1371/jo… 2012-02-08T00:00… "\n Most biolo… Composite Structura…
## 4 10.1371/jo… 2015-12-29T00:00… "\nAny release of ant… Composite Sampling …
## 5 10.1371/jo… 2018-09-24T00:00… "\nProteins with low-… Proteome-scale rela…
## # … with 1 more row
## [1] "Search term: sensing"
## [1] "Count: 45688"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2015-09-25T00:00… "\nWidely distributed… Heterogeneous Parti…
## 2 10.1371/jo… 2018-03-05T00:00… "\nCell size is thoug… A computational mod…
## 3 10.1371/jo… 2016-07-25T00:00… "\nThe rubber hand il… ‘Robot’ Hand Illusi…
## 4 10.1371/jo… 2012-12-20T00:00… "\n Natural an… A Genome-Wide Inves…
## 5 10.1371/jo… 2011-02-02T00:00… "\n The Arabid… Sense and Antisense…
## # … with 1 more row
## [1] "Search term: optical"
## [1] "Count: 32292"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2016-10-26T00:00… "Purpose: To investig… The Effect of Optic…
## 2 10.1371/jo… 2012-09-04T00:00… "Background: Research… A Novel Animal Mode…
## 3 10.1371/jo… 2015-10-01T00:00… "Purpose: To assess t… Glaucomatous-Type O…
## 4 10.1371/jo… 2007-02-07T00:00… "\n The re… A Functional Archit…
## 5 10.1371/jo… 2009-10-13T00:00… "\nIn this Research A… Dynamic Coupling of…
## # … with 1 more row
## [1] "Search term: process"
## [1] "Count: 189114"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2018-10-23T00:00… "\nOrganizational pro… Development and val…
## 2 10.1371/jo… 2010-12-03T00:00… "\n We present… Feller Processes: T…
## 3 10.1371/jo… 2017-04-27T00:00… "\nMetabolic disorder… Tracking disease pr…
## 4 10.1371/jo… 2019-02-20T00:00… "\nTropidolaemus wagl… Description of cran…
## 5 10.1371/jo… 2009-04-23T00:00… "Background: The trad… Biological Process …
## # … with 1 more row
## [1] "Search term: thermal"
## [1] "Count: 19021"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2017-03-13T00:00… "\nThermal properties… Measurement of ther…
## 2 10.1371/jo… 2011-06-17T00:00… "\n Most organ… Isopods Failed to A…
## 3 10.1371/jo… 2014-12-22T00:00… "\nThermal energy tra… Thermophysical Prop…
## 4 10.1371/jo… 2016-02-03T00:00… "\nClimate change is … Ontogenetic Variati…
## 5 10.1371/jo… 2015-05-18T00:00… "\nThermal conductivi… Large Thermal Condu…
## # … with 1 more row
## [1] "Search term: laser"
## [1] "Count: 22406"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2014-12-26T00:00… "Objective: To evalua… Ranibizumab Monothe…
## 2 10.1371/jo… 2018-09-06T00:00… "\nPicosecond lasers … Effects of picoseco…
## 3 10.1371/jo… 2018-11-29T00:00… "\nThe nucleus accumb… Optogenetic self-st…
## 4 10.1371/jo… 2015-07-10T00:00… "\nThe mouse model of… Optimization of an …
## 5 10.1371/jo… 2013-12-11T00:00… "\nSafe and effective… Near-Infrared Laser…
## # … with 1 more row
## [1] "Search term: monitoring"
## [1] "Count: 89594"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2015-05-29T00:00… "\nThe objective of t… Optimal Design of R…
## 2 10.1371/jo… 2015-05-08T00:00… "\nIn this study, we … Do Parents Meet Ado…
## 3 10.1371/jo… 2013-02-28T00:00… "Objectives: Mortalit… Monitoring of Antir…
## 4 10.1371/jo… 2015-03-20T00:00… "Background: The cost… The Cost-Effectiven…
## 5 10.1371/jo… 2017-08-23T00:00… "\nPatients with Park… Verbal monitoring i…
## # … with 1 more row
## [1] "Search term: nanotube"
## [1] "Count: 532"
## [1] "Sample of documents found..."
## # A tibble: 6 x 4
## id publication_date abstract title
## <chr> <chr> <chr> <chr>
## 1 10.1371/jo… 2013-10-04T00:00… "\nIn this study, Ag … Both Enhanced Bioco…
## 2 10.1371/jo… 2017-04-12T00:00… "\nNanotubes are form… New route for self-…
## 3 10.1371/jo… 2014-01-02T00:00… "\nNature routinely c… Composition Based S…
## 4 10.1371/jo… 2012-05-24T00:00… "\n We present… Carbon Nanotube Sol…
## 5 10.1371/jo… 2006-04-28T00:00… "Here our goal is to … Designing a Nanotub…
## # … with 1 more row
rplos Vignette. Interface to the Solr based search API for PLOS journals. Functions search for articles, retrieve articles, make plots, do faceted searches, highlight searches, and present results of highlighted searches in a browser.fulltext Vignette. Facilitates text mining with an emphasis on open access journals. The vignette provides examples.